import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import seaborn as sb
What factors affect loan’s amount ?
df=pd.read_csv('prosperLoanData.csv')
df.info()
#check for null values
obj = df.isnull().sum()
for key,value in obj.iteritems():
print(key,",",value)
I want to know what factors affect a loan’s outcome status. For that my target variable will be LoanStatus. For this I do not want to choose the variables randomly. I wanto apply a correlation and choose the variables that best desribe it.
loan_data=df[['BorrowerAPR','LoanStatus','BorrowerRate','AvailableBankcardCredit','LoanOriginalAmount','Investors','Occupation',
'EmploymentStatus','EmploymentStatusDuration','ProsperRating (numeric)','ProsperRating (Alpha)',
"Term",'Recommendations',"IncomeRange",'MonthlyLoanPayment','ListingCreationDate','ClosedDate','DebtToIncomeRatio']]
loan_data.isnull().sum()
#check for duplicate values
sum(loan_data.duplicated())
loan_data.shape
loan_data
loan_data=loan_data.dropna(subset=['BorrowerAPR'])
loan_data.info()
loan_data=loan_data.dropna(subset=['AvailableBankcardCredit'])
#drop the null values
loan_data['Occupation'] = loan_data['Occupation'].fillna('Other',inplace =False)
# fill na values and call them OTHER
I wil also change the values of occupation because they are so many and have the same values
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace =["Student - College Graduate Student","Student - College Junior",'Student - College Senior',"Student - College Junior","Student - College Freshman",'Student - Technical School',"Student - Community College","Student - College Freshman","Student - Technical School","Student - College Sophomore"],
value ="Student")
#merge all students together
a=loan_data["Occupation"].unique()
sorted(a)
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace =['Engineer - Chemical',
'Engineer - Electrical',
'Engineer - Mechanical','Military Enlisted'], value ="Engineer")
#merge all engineers together
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace =['Tradesman - Carpenter',
'Tradesman - Electrician',
'Tradesman - Mechanic',
'Tradesman - Plumber'], value ="Tradesman")
#merge all tradesman together
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace =['Food Service',
'Food Service Management'], value ="Food service")
#merge all tradesman together
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace=["Nurse (LPN)","Nurse (RN)", "Nurse's Aide"],value="Nurse")
#merge nurses
loan_data['ClosedDate'] = loan_data['ClosedDate'].fillna('Current',inplace =False)
#we change the nul values with current as the ones which are not closed are current
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace=["Dentist"],value="Doctor")
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace=["Unknown"],value="Other")
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace=["Professor","Teachers Aide"],value="Teacher")
loan_data["Occupation"]=loan_data["Occupation"].replace(to_replace=["Sales - Commission","Retail Management","Sales - Retail","Realtor"],value="Sales")
loan_data.isnull().sum()
loan_data['ListingCreationDate'] = loan_data['ListingCreationDate'].astype('datetime64[ns]')
loan_data['start_year'] = pd.DatetimeIndex(loan_data['ListingCreationDate']).year
#i need to change the type of this column into datetype as it is a date and then get the year out of it
loan_data['start_month'] = pd.DatetimeIndex(loan_data['ListingCreationDate']).month
#to get the month out of that column
loan_data['start_month']=loan_data['start_month'].apply(str)
loan_data['start_year']=loan_data['start_year'].apply(str)
del loan_data['ListingCreationDate']
#change the month and year columns into string and delete the creation date column
loan_data.info()
loan_data=loan_data.dropna(subset=['EmploymentStatus'])
#drop null values
loan_data=loan_data.dropna(subset=['DebtToIncomeRatio'])
#drop null values
loan_data.isnull().sum()
loan_data=loan_data.dropna(subset=['EmploymentStatusDuration'])
#drop null values
loan_data=loan_data.drop(columns="ProsperRating (numeric)")
#drop null values
loan_data['ProsperRating (Alpha)'] = loan_data['ProsperRating (Alpha)'].fillna('Unknown',inplace =False)
#fill the numm values with unknown because they are so many we can not dleete them
loan_data.isnull().sum()
#check if cleaning was done right
#count plot for Distribution of Loan status
plt.figure(figsize=(13,9))
loan_data['LoanStatus'].value_counts().plot(kind='bar')
plt.title('Distribution of Loan status')
We see that most of the loans are current they are not closed yet. so plus the second highest loans are the ones which have been completed.
#Find out the frequency count for IncomeRange
count=loan_data['IncomeRange'].value_counts()
count
plt.figure(figsize=(13,9))
ax = sns.countplot(x="IncomeRange", data=loan_data)
plt.title('Distribution of Income Rnage')
Here we see that the people taking loans mostly have the range of their income is between 25-49000$ the greater their income the less loan they take. Poeple who do not work are almost not given loans.
plt.figure(figsize=(22,9))
loan_data['Occupation'].value_counts().sort_index().plot.bar()
plt.title('Distribution of Occupation')
plt.show()
This graph shows that the most occured occupations are professionals teacher, executives, computer programmers
#after i want to see from the biggest value count to the smallest
plt.figure(figsize=(22,9))
(loan_data['Occupation'].value_counts().plot.bar())
(loan_data['Occupation'].value_counts().tail(10).plot.bar())
plt.title('Occupations with the lowest counts')
#see the smallest counts of occupations
The least loans were taken by judges
loan_data['LoanOriginalAmount']
plt.figure(figsize=(22,10))
sns.distplot(loan_data["MonthlyLoanPayment"])
plt.title('Distribution of MonthlyLoanPayment')
plt.ylabel('Count')
plt.show()
We see that the distribution is normal and right skewed. It has one evident peak around 130. The range is between 0 to 1000.
sns.distplot(loan_data['LoanOriginalAmount'], rug=True)
plt.title('Distribution of LoanOriginalAmount')
plt.ylabel('Count')
plt.show()
It has several peaks.
sns.distplot(loan_data['start_year'], rug=True)
plt.ylabel('Count')
plt.show()
The most loans were given in 2013. The loans amount were given more by more after each year till 2014 when they have
sns.distplot(loan_data['start_month'], rug=True)
plt.ylabel('Count')
plt.show()
The most loans were given in january and october and the least in April
sns.distplot(loan_data['BorrowerRate'], rug=True)
plt.ylabel('Count')
plt.show()
This is a normal distribution we can see that the most values lie around the 0.15 and there is a peak between 0.30 to 0.34.
sizes = loan_data['ProsperRating (Alpha)'].value_counts()
sizes
Distribution of ProsperRating (Alpha)
plt.figure(figsize=(50,20))
count_alpha= loan_data["ProsperRating (Alpha)"].value_counts()
label=["Unknown","C","B","A","D","E","HR","AA"]
fig1, ax1 = plt.subplots()
plt.figure(figsize=(20,20))
ax1.pie(count_alpha, autopct='%1.1f%%',shadow=True, startangle=90,labels=label)
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
What comes to the prosper rating we see that unkown rates have the most share. The second biggest proportion loans have been given a rate of 'C', and then 'B'. 'AA' has been given to very few loans only to 5.1%.
loan_data.info()
#correlation plot to find out the correlations between the variables
plt.figure(figsize = [13, 10])
sns.heatmap(loan_data.corr(), annot = True, fmt = '.3f', cmap = 'vlag_r', center = 0)
plt.title('Correlation of numerical variables')
plt.show()
This shows that Loan original amount is correlated onegatively to borrower apr and postively to monthly loan payment and to investors
sns.FacetGrid(loan_data,size=5).map(plt.scatter,"LoanOriginalAmount",'MonthlyLoanPayment').add_legend()
plt.title('LoanOriginalAmount vs MonthlyLoanPayment')
plt.show()
Loan amount and montly loan payments are positievly related.
loan_data.info()
categories=['LoanStatus','Occupation','EmploymentStatus',' ProsperRating (Alpha)','IncomeRange']
numerical=['BorrowerAPR','BorrowerRate','AvailableBankcardCredit','LoanOriginalAmount','Investors',' EmploymentStatusDuration',
'Term',' Recommendations','MonthlyLoanPayment','InvestmentFromFriendsAmount','InvestmentFromFriendsCount']
datalim=loan_data[:500] # to have less data points
plt.scatter(datalim["LoanOriginalAmount"],datalim["Investors"])
plt.ylabel('Investors')
plt.xlabel('LoanOriginalAmount')
plt.title('Investors vsLoanOriginalAmount ')
plt.show()
30000 and 35000 loan amounts are outliers other than that these two variables are positevely correlated though not so much
#cross plots
plt.figure(figsize=(14, 14))
sns.pairplot(loan_data, diag_kind='kde');
def boxgrid(x, y, **kwargs):
""" Quick hack for creating box plots with seaborn's PairGrid. """
default_color = sns.color_palette()[0]
sns.boxplot(x, y, color = default_color)
plt.figure(figsize = [10, 10])
g = sns.PairGrid(data = loan_data, y_vars = ['BorrowerAPR'],
x_vars = ['Term','ProsperRating (Alpha)', 'EmploymentStatus'], height = 4, aspect = 2)
g.map(boxgrid)
plt.show();
plt.figure(figsize = [10, 10])
g = sns.PairGrid(data = loan_data, y_vars = ['LoanOriginalAmount'],
x_vars = ['Term','ProsperRating (Alpha)', 'EmploymentStatus'], height = 4, aspect = 2)
g.map(boxgrid)
plt.title('LoanOriginalAmount vs Term ,ProsperRating (Alpha), EmploymentStatus')
plt.show();
n = 10
most_occupation=loan_data['Occupation'].value_counts()[:n]
most_occupation
loan_data['Occupation'].value_counts().head(10)
'''plt.figure(figsize = [10, 23])
sb.countplot(data = loan_data, x = loan_data['Occupation'].value_counts().head(10),y = 'LoanStatus', palette = 'Greens')
plt.xticks(rotation = 15);'''
plt.figure(figsize=[10, 10])
sb.countplot(y='LoanStatus', hue='Term', data=loan_data)
plt.title('LoanStatus by hue')
plt.show()
So the loan for 36 months has mostly current status and secondly it has completed status. 60 months loans are having mostly current status and the second highest number of 60 months loans has been completed already.
plt.figure(figsize=[10, 10])
sb.countplot(y='LoanStatus', hue='start_year', data=loan_data)
plt.title('LoanStatus by start year')
plt.show()
Current Loans have been mostly given in the year of the 2013.And for the completed loans they have been given mostly in 2008.
plt.figure(figsize=[10, 10])
sb.countplot(y='LoanStatus', hue='ProsperRating (Alpha)', data=loan_data)
plt.title('loan status by ProsperRating (Alpha) ')
plt.show()
Most of the current loans were given a C rating and secondly a B rating.For the completed loans most of the ratings are unknown, secondly they have been given D rating.
plt.figure(figsize = [20, 20])
plt.scatter(data = loan_data, x = 'LoanStatus', y = 'LoanOriginalAmount', alpha = 1/10)
plt.title('LoanStatus vs LoanOriginalAmount')
plt.show()
Current has larger loan amount >35K.most of the loans which are defaulted have loan amount >25K; I assume that the current loan given have higher amounts than the completed ones.
plt.figure(figsize=[10, 10])
sb.barplot(y='LoanOriginalAmount', x='ProsperRating (Alpha)', data=loan_data)
plt.title('LoanOriginalAmount vs ProsperRating (Alpha)')
plt.show()
The maximum loan amounts were given to A B and to AA. HR and E have the lowest amount
plt.figure(figsize = [10, 10])
plt.scatter(data = loan_data, y = 'AvailableBankcardCredit', x = 'LoanOriginalAmount')
plt.title('AvailableBankcardCredit vs LoanOriginalAmount')
plt.ylabel('AvailableBankcardCredit')
plt.xlabel('LoanOriginalAmount')
plt.show()
The people who have less credit amount take more loans.
plt.figure(figsize = [10, 10])
plt.scatter(data = loan_data, y = 'BorrowerAPR', x = 'LoanOriginalAmount')
plt.title('BorrowerAPR vs LoanOriginalAmount')
plt.ylabel('BorrowerAPR')
plt.xlabel('LoanOriginalAmount')
plt.show()
The highest APR is for the lowest loan amount
plt.figure(figsize = [10, 10])
plt.scatter(data = loan_data, y = 'BorrowerRate', x = 'LoanOriginalAmount')
plt.title('BorrowerRate vs LoanOriginalAmount')
plt.show()
There is a similar impact of the BorrowerRate and borrowerAPR on the loan amount. They both are higher when the loan amount is lower.
plt.figure(figsize = [15, 8])
# subplot 1: color vs cut
sb.countplot(data = loan_data, x = 'EmploymentStatus', hue = 'LoanStatus', palette = 'Reds')
plt.title('EmploymentStatus by LoanStatus')
plt.show()
Most of the Emplyed people have current loan status and the second largest staus is the completed. So we can conclude that Employed people have less non paid loans. Most of the loans completed are by full_time eployees. the other were given less loans or not given at all.
plt.figure(figsize = [15, 8])
# subplot 1: color vs cut
sb.barplot(data = loan_data, y = 'LoanOriginalAmount', x = 'EmploymentStatus')
plt.title('LoanOriginalAmount vs EmploymentStatus')
plt.show()
The people who are employed get the highest loan amounts. Self employeed people get the second highest loan amounts. Part time workers get the lowest loans still I am surprised that not employeed people get more loans than part-time workers.
sns.FacetGrid(loan_data,hue="LoanStatus",size=5).map(plt.scatter,"LoanOriginalAmount",'IncomeRange').add_legend()
plt.title('LoanOriginalAmount vs IncomeRange by LoanStatus')
plt.show()
loan_data.info()
g = sb.FacetGrid(data = loan_data, col = 'Term',palette = 'colorblind',size=10)
g.map(sb.regplot, 'EmploymentStatusDuration', 'LoanOriginalAmount',x_jitter=0.04, scatter_kws={'alpha':0.1})
plt.title('EmploymentStatusDuration vs LoanOriginalAmount by Term')
plt.show()
g = sb.FacetGrid(data = df, hue = 'EmploymentStatus', size = 10,
palette = 'colorblind',aspect=2)
g.map(plt.scatter, 'EmploymentStatusDuration', 'LoanOriginalAmount')
plt.title('EmploymentStatusDuration vs LoanOriginalAmount by EmploymentStatus')
g.add_legend()
loan_data['Term']=loan_data['Term'].apply(str)
plt.figure(figsize = [30, 10])
ax = sb.pointplot(data = loan_data, x = 'LoanStatus', y = 'LoanOriginalAmount', hue = 'Term',
dodge = 0.3, linestyles = "")
plt.title('LoanStatus vs LoanOriginalAmount by Term')
plt.figure(figsize = [30, 10])
ax = sb.barplot(data = loan_data, x = 'EmploymentStatus', y = 'LoanOriginalAmount', hue = 'Term')
ax.legend(loc = 8, ncol = 3, framealpha = 1, title = 'LoanOriginalAmount')
plt.title('EmploymentStatus vs LoanOriginalAmount by Term')
plt.show()
#loan_data.to_csv('clean_loan.csv')
'''from google.colab import files
files.download('clean_loan.csv')'''